path: root/llama.h
author    Ettore Di Giacinto <mudler@users.noreply.github.com>    2023-06-20 03:24:39 +0200
committer GitHub <noreply@github.com>    2023-06-20 04:24:39 +0300
commit    aacdbd40562684665b6f7b8ba6695b7a2088bbb0 (patch)
tree      b9fe34e235b99d4c5ae07ee5c07f9375c86c9d17 /llama.h
parent    20568fe60f00155fa25e92eb3a7f6b911d557967 (diff)
llama : fix params struct alignment (#1936)
* Workaround struct misalignment during value-copy

Signed-off-by: mudler <mudler@localai.io>

* Move booleans to the bottom of the structure

Signed-off-by: mudler <mudler@localai.io>

* Add comment

Signed-off-by: mudler <mudler@localai.io>

---------

Signed-off-by: mudler <mudler@localai.io>
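Why the reorder helps: a bool sandwiched between wider members forces the compiler to emit interior padding, which shifts the offset of every field that follows; a consumer that copies the struct by value against a mismatched layout (the misalignment this patch works around) then reads members at the wrong offsets. The following is a minimal, hypothetical C sketch (struct and field names are invented for illustration, not taken from llama.h) that makes the offset shift visible:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdio.h>

    /* bool placed between int and pointer members: the compiler inserts
       interior padding after it so the next field stays naturally aligned */
    struct params_interleaved {
        int   n_ctx;
        bool  low_vram;   /* 1 byte, then 3 bytes of padding before seed */
        int   seed;
        void *user_data;
    };

    /* same fields with the bool moved to the tail: padding sits only at
       the end, so member offsets match a naive field-by-field mirror */
    struct params_grouped {
        int   n_ctx;
        int   seed;
        void *user_data;
        bool  low_vram;
    };

    int main(void) {
        printf("interleaved: sizeof=%zu offsetof(seed)=%zu\n",
               sizeof(struct params_interleaved),
               offsetof(struct params_interleaved, seed));
        printf("grouped:     sizeof=%zu offsetof(seed)=%zu\n",
               sizeof(struct params_grouped),
               offsetof(struct params_grouped, seed));
        return 0;
    }

On a typical 64-bit ABI both structs occupy 24 bytes, but seed sits at offset 8 in the interleaved layout versus 4 in the grouped one; that silent shift is exactly what corrupts a value-copy performed against the wrong layout.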
Diffstat (limited to 'llama.h')
-rw-r--r--  llama.h  17
1 file changed, 8 insertions, 9 deletions
diff --git a/llama.h b/llama.h
index 1241ba6..0de530d 100644
--- a/llama.h
+++ b/llama.h
@@ -71,28 +71,27 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
-   struct llama_context_params {
+    struct llama_context_params {
+        int seed;                              // RNG seed, -1 for random
         int n_ctx;                             // text context
         int n_batch;                           // prompt processing batch size
         int n_gpu_layers;                      // number of layers to store in VRAM
         int main_gpu;                          // the GPU that is used for scratch and small tensors
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-        bool low_vram; // if true, reduce VRAM usage at the cost of performance
-        int seed; // RNG seed, -1 for random
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
+        // Keep the booleans together to avoid misalignment during copy-by-value.
+        bool low_vram;   // if true, reduce VRAM usage at the cost of performance
         bool f16_kv;     // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
         bool vocab_only; // only load the vocabulary, no weights
         bool use_mmap;   // use mmap if possible
         bool use_mlock;  // force system to keep model in RAM
         bool embedding;  // embedding mode only
-
-        // called with a progress value between 0 and 1, pass NULL to disable
-        llama_progress_callback progress_callback;
-        // context pointer passed to the progress callback
-        void * progress_callback_user_data;
     };
-
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32 = 0,
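For context, a minimal usage sketch, assuming the llama.h API as of this commit (which exposes llama_context_default_params()); the function returns the params struct by value, and that copy is the path the reordering is meant to keep well-defined for downstream bindings:

    #include <stdio.h>
    #include "llama.h"

    int main(void) {
        /* returned by value: this copy is what misbehaved in bindings
           while the bool members sat in the middle of the struct */
        struct llama_context_params params = llama_context_default_params();
        params.seed  = 42;    /* seed is now the first member */
        params.n_ctx = 2048;
        printf("n_ctx=%d seed=%d\n", params.n_ctx, params.seed);
        return 0;
    }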