Diffstat (limited to 'llama.h')
-rw-r--r--  llama.h  124
1 file changed, 113 insertions, 11 deletions
diff --git a/llama.h b/llama.h
index 4596b1e..d237bcc 100644
--- a/llama.h
+++ b/llama.h
@@ -53,6 +53,10 @@
#define LLAMA_SUPPORTS_GPU_OFFLOAD
#endif
+#ifndef LLAMA_DEFAULT_RMS_EPS
+#define LLAMA_DEFAULT_RMS_EPS 5e-6f
+#endif
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -82,13 +86,34 @@ extern "C" {
typedef void (*llama_progress_callback)(float progress, void *ctx);
- struct llama_context_params {
- uint32_t seed; // RNG seed, -1 for random
- int32_t n_ctx; // text context
- int32_t n_batch; // prompt processing batch size
- int32_t n_gpu_layers; // number of layers to store in VRAM
- int32_t main_gpu; // the GPU that is used for scratch and small tensors
- float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
+ enum llama_log_level {
+ LLAMA_LOG_LEVEL_ERROR = 2,
+ LLAMA_LOG_LEVEL_WARN = 3,
+ LLAMA_LOG_LEVEL_INFO = 4
+ };
+
+ // Signature for logging events
+ // Note that text includes the newline character at the end for most events.
+ // If your logging mechanism cannot handle that, check whether the last character is '\n' and strip it
+ // if it exists.
+ // The newline may be absent for progress reports, where '.' is output repeatedly.
+ typedef void (*llama_log_callback)(llama_log_level level, const char * text, void * user_data);
+
+ struct llama_context_params {
+ uint32_t seed; // RNG seed, -1 for random
+ int32_t n_ctx; // text context
+ int32_t n_batch; // prompt processing batch size
+ int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams)
+ float rms_norm_eps; // rms norm epsilon (TEMP - will be moved to model hparams)
+ int32_t n_gpu_layers; // number of layers to store in VRAM
+ int32_t main_gpu; // the GPU that is used for scratch and small tensors
+
+ const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
+
+ // ref: https://github.com/ggerganov/llama.cpp/pull/2054
+ float rope_freq_base; // RoPE base frequency
+ float rope_freq_scale; // RoPE frequency scaling factor
+
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
@@ -96,6 +121,7 @@ extern "C" {
// Keep the booleans together to avoid misalignment during copy-by-value.
bool low_vram; // if true, reduce VRAM usage at the cost of performance
+ bool mul_mat_q; // if true, use experimental mul_mat_q kernels
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
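
For orientation, here is a minimal sketch of how the reworked llama_context_params might be filled in. It assumes the llama_load_model_from_file() and llama_new_context_with_model() declarations from the unchanged part of this header; the field values, the helper name, and the model path are purely illustrative.

    #include "llama.h"
    #include <stddef.h>

    // Illustrative only: configure the fields added in this change.
    static struct llama_context * example_make_context(const char * path) {
        struct llama_context_params params = llama_context_default_params();
        params.n_gqa           = 8;                      // grouped-query attention factor (e.g. 70B-class models)
        params.rms_norm_eps    = LLAMA_DEFAULT_RMS_EPS;  // default defined near the top of this header
        params.rope_freq_base  = 10000.0f;               // stock RoPE base frequency
        params.rope_freq_scale = 1.0f;                   // no frequency scaling
        params.mul_mat_q       = true;                   // opt in to the experimental mul_mat_q kernels
        params.tensor_split    = NULL;                   // NULL: let the backend pick its default split

        struct llama_model * model = llama_load_model_from_file(path, params);
        if (model == NULL) {
            return NULL;
        }
        return llama_new_context_with_model(model, params);
    }
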
@@ -134,6 +160,40 @@ extern "C" {
bool quantize_output_tensor; // quantize output.weight
} llama_model_quantize_params;
+ // grammar types
+ struct llama_grammar;
+
+ // grammar element type
+ enum llama_gretype {
+ // end of rule definition
+ LLAMA_GRETYPE_END = 0,
+
+ // start of alternate definition for rule
+ LLAMA_GRETYPE_ALT = 1,
+
+ // non-terminal element: reference to rule
+ LLAMA_GRETYPE_RULE_REF = 2,
+
+ // terminal element: character (code point)
+ LLAMA_GRETYPE_CHAR = 3,
+
+ // inverse char(s) ([^a], [^a-b], [^abc])
+ LLAMA_GRETYPE_CHAR_NOT = 4,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
+ // be an inclusive range ([a-z])
+ LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
+
+ // modifies a preceding LLAMA_GRETYPE_CHAR or
+ // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
+ LLAMA_GRETYPE_CHAR_ALT = 6,
+ };
+
+ typedef struct llama_grammar_element {
+ enum llama_gretype type;
+ uint32_t value; // Unicode code point or rule ID
+ } llama_grammar_element;
+
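+
To make the element encoding concrete, here is a hedged sketch of one possible rule set for the grammar root ::= [a-z] root | [a-z] (one or more lowercase letters), expressed with the element types above. The array names are hypothetical; they are reused in the llama_grammar_init sketch further down.

    #include "llama.h"

    // Hypothetical encoding of:  root ::= [a-z] root | [a-z]
    static const llama_grammar_element rule_root[] = {
        { LLAMA_GRETYPE_CHAR,           'a' }, // terminal: lower bound of the range
        { LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z' }, // extends the preceding CHAR to the range [a-z]
        { LLAMA_GRETYPE_RULE_REF,       0   }, // recurse into rule 0 (root itself)
        { LLAMA_GRETYPE_ALT,            0   }, // second alternate: a single [a-z]
        { LLAMA_GRETYPE_CHAR,           'a' },
        { LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z' },
        { LLAMA_GRETYPE_END,            0   }, // end of rule definition
    };

    static const llama_grammar_element * rules[] = { rule_root };
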
// performance timing information
struct llama_timings {
double t_start_ms;
@@ -148,6 +208,12 @@ extern "C" {
int32_t n_eval;
};
+ // Set callback for all future logging events.
+ // If this is not called, or NULL is supplied, everything is output on stderr.
+ LLAMA_API void llama_log_set(llama_log_callback log_callback, void * user_data);
+
+ LLAMA_API int llama_max_devices();
+
LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params();
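
As a usage sketch for llama_log_set(): a small callback that tags messages by level and forwards them to stderr. The callback name is hypothetical and the output format is arbitrary.

    #include "llama.h"
    #include <stdio.h>

    // Hypothetical log sink: tag each message with its level.
    static void example_log_callback(enum llama_log_level level, const char * text, void * user_data) {
        (void) user_data;
        const char * tag = (level == LLAMA_LOG_LEVEL_ERROR) ? "ERR " :
                           (level == LLAMA_LOG_LEVEL_WARN)  ? "WARN" : "INFO";
        fprintf(stderr, "[%s] %s", tag, text); // text usually already ends in '\n' (see llama_log_callback)
    }

    // During startup:
    //     llama_log_set(example_log_callback, NULL);
    //     fprintf(stderr, "max devices: %d\n", llama_max_devices());
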
@@ -270,10 +336,21 @@ extern "C" {
int n_max_tokens,
bool add_bos);
+ LLAMA_API int llama_tokenize_with_model(
+ const struct llama_model * model,
+ const char * text,
+ llama_token * tokens,
+ int n_max_tokens,
+ bool add_bos);
+
LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
+ LLAMA_API int llama_n_vocab_from_model(const struct llama_model * model);
+ LLAMA_API int llama_n_ctx_from_model (const struct llama_model * model);
+ LLAMA_API int llama_n_embd_from_model (const struct llama_model * model);
+
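+
The *_with_model / *_from_model variants operate on a bare llama_model, so no context is required. A rough sketch follows; the helper name is hypothetical, and sizing the token buffer via llama_n_ctx_from_model() is just one convenient choice.

    #include "llama.h"
    #include <stdio.h>
    #include <stdlib.h>

    // Hypothetical: tokenize a prompt and report model metadata without creating a context.
    static void example_inspect(const struct llama_model * model, const char * text) {
        const int n_max = llama_n_ctx_from_model(model);
        llama_token * tokens = malloc((size_t) n_max * sizeof(llama_token));

        const int n = llama_tokenize_with_model(model, text, tokens, n_max, /*add_bos=*/true);
        if (n >= 0) {
            printf("%d tokens; vocab = %d, embd = %d\n",
                   n, llama_n_vocab_from_model(model), llama_n_embd_from_model(model));
        }
        free(tokens);
    }
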
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
@@ -282,6 +359,12 @@ extern "C" {
float * scores,
int capacity);
+ LLAMA_API int llama_get_vocab_from_model(
+ const struct llama_model * model,
+ const char * * strings,
+ float * scores,
+ int capacity);
+
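+
A short sketch of llama_get_vocab_from_model(): the caller supplies the output arrays and a capacity, and the return value says how many entries were filled. The function name and the capacity of 16 are illustrative.

    #include "llama.h"
    #include <stdio.h>

    // Hypothetical: print the first few vocabulary entries straight from the model.
    static void example_dump_vocab(const struct llama_model * model) {
        const char * strings[16];
        float        scores [16];
        const int got = llama_get_vocab_from_model(model, strings, scores, 16);
        for (int i = 0; i < got; i++) {
            printf("%4d: %s (score %.3f)\n", i, strings[i], scores[i]);
        }
    }
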
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
@@ -294,13 +377,28 @@ extern "C" {
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
- LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);
+ LLAMA_API const char * llama_token_to_str(
+ const struct llama_context * ctx,
+ llama_token token);
+
+ LLAMA_API const char * llama_token_to_str_with_model(
+ const struct llama_model * model,
+ llama_token token);
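
The model-level counterpart mirrors the context-based call; a one-line sketch, assuming a model and a token id obtained elsewhere:

    // Hypothetical: look up a token's text using only the model.
    //     const char * piece = llama_token_to_str_with_model(model, id);
    //     fputs(piece, stdout);
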
// Special tokens
LLAMA_API llama_token llama_token_bos(); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(); // end-of-sentence
LLAMA_API llama_token llama_token_nl(); // next-line
+ // Grammar
+ //
+ LLAMA_API struct llama_grammar * llama_grammar_init(
+ const llama_grammar_element ** rules,
+ size_t n_rules,
+ size_t start_rule_index);
+
+ LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
+
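+
Tying this to the llama_grammar_element sketch shown after the llama_gretype enum above: a grammar is built from an array of rule pointers plus the index of the start rule, and must be freed when no longer needed. The function name below is hypothetical.

    #include "llama.h"

    // Hypothetical lifecycle, reusing the rules[] array from the earlier sketch
    // (one rule; rule 0 is the start rule).
    static struct llama_grammar * example_make_grammar(void) {
        return llama_grammar_init(rules, /*n_rules=*/1, /*start_rule_index=*/0);
    }
    // ...and when finished:
    //     llama_grammar_free(grammar);
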
// Sampling functions
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
@@ -313,13 +411,11 @@ extern "C" {
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
/// @params guidance_ctx A separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
/// @params scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
- /// @params smooth_factor Smooth factor between guidance logits and original logits. 1.0f means only use guidance logits. 0.0f means only original logits.
LLAMA_API void llama_sample_classifier_free_guidance(
struct llama_context * ctx,
llama_token_data_array * candidates,
struct llama_context * guidance_ctx,
- float scale,
- float smooth_factor);
+ float scale);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
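
With smooth_factor removed, classifier-free guidance is driven by the single scale parameter. A hedged sketch: the 1.5f value is arbitrary, the helper name is hypothetical, and guidance_ctx is a second context primed with the negative prompt as described above.

    #include "llama.h"

    // Hypothetical guidance step with the simplified signature.
    static void example_apply_cfg(struct llama_context * ctx,
                                  struct llama_context * guidance_ctx,
                                  llama_token_data_array * candidates) {
        llama_sample_classifier_free_guidance(ctx, candidates, guidance_ctx, /*scale=*/1.5f);
    }
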
@@ -337,6 +433,9 @@ extern "C" {
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+ /// @details Apply constraints from grammar
+ LLAMA_API void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar);
+
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@@ -358,6 +457,9 @@ extern "C" {
/// @details Randomly selects a token from the candidates based on their probabilities.
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
+ /// @details Accepts the sampled token into the grammar
+ LLAMA_API void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token);
+
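+
Putting the two grammar hooks together: a sketch of one sampling step that masks candidates with the grammar before sampling and then advances the grammar with the accepted token. It assumes llama_get_logits(), llama_n_vocab(), llama_token_data_array and llama_sample_token() from the unchanged part of this header; the helper name is hypothetical.

    #include "llama.h"
    #include <stdlib.h>

    // Hypothetical grammar-constrained sampling step.
    static llama_token example_sample_with_grammar(struct llama_context * ctx,
                                                   struct llama_grammar * grammar) {
        const int n_vocab = llama_n_vocab(ctx);
        float * logits    = llama_get_logits(ctx);

        llama_token_data * data = malloc((size_t) n_vocab * sizeof(llama_token_data));
        for (int i = 0; i < n_vocab; i++) {
            data[i].id    = i;
            data[i].logit = logits[i];
            data[i].p     = 0.0f;
        }
        llama_token_data_array candidates = { data, (size_t) n_vocab, false };

        llama_sample_grammar(ctx, &candidates, grammar);   // mask tokens the grammar does not allow
        const llama_token id = llama_sample_token(ctx, &candidates);
        llama_grammar_accept_token(ctx, grammar, id);      // advance the grammar state

        free(data);
        return id;
    }
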
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx);