-rw-r--r--  README.md                   2
-rw-r--r--  examples/common.cpp         3
-rw-r--r--  examples/main/README.md     2
-rw-r--r--  examples/server/README.md   2
-rw-r--r--  examples/server/server.cpp  3
-rw-r--r--  llama-util.h                6
-rw-r--r--  llama.cpp                  18
7 files changed, 22 insertions, 14 deletions
diff --git a/README.md b/README.md
index 63457b6..476cc43 100644
--- a/README.md
+++ b/README.md
@@ -239,7 +239,7 @@ In order to build llama.cpp you have three different options.
- Using `Zig`:
```bash
- zig build -Drelease-fast
+ zig build -Doptimize=ReleaseFast
```
### Metal Build
diff --git a/examples/common.cpp b/examples/common.cpp
index 93159c6..fad1688 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -267,7 +267,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
break;
}
params.lora_adapter = argv[i];
- params.use_mmap = false;
} else if (arg == "--lora-base") {
if (++i >= argc) {
invalid_param = true;
@@ -499,7 +498,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --mtest compute maximum memory usage\n");
fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n");
fprintf(stderr, " --verbose-prompt print prompt before generation\n");
- fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+ fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
diff --git a/examples/main/README.md b/examples/main/README.md
index 3753861..04b8d54 100644
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -293,5 +293,5 @@ These options provide extra functionality and customization when running the LLa
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
- `-ts SPLIT, --tensor-split SPLIT`: When using multiple GPUs this option controls how large tensors should be split across all GPUs. `SPLIT` is a comma-separated list of non-negative values that assigns the proportion of data that each GPU should get in order. For example, "3,2" will assign 60% of the data to GPU 0 and 40% to GPU 1. By default the data is split in proportion to VRAM but this may not be optimal for performance. Requires cuBLAS.
- `-lv, --low-vram`: Do not allocate a VRAM scratch buffer for holding temporary results. Reduces VRAM usage at the cost of performance, particularly prompt processing speed. Requires cuBLAS.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
diff --git a/examples/server/README.md b/examples/server/README.md
index ad9b6bb..3691abd 100644
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -16,7 +16,7 @@ Command line options:
- `--memory-f32`: Use 32-bit floats instead of 16-bit floats for memory key+value. Not recommended.
- `--mlock`: Lock the model in memory, preventing it from being swapped out when memory-mapped.
- `--no-mmap`: Do not memory-map the model. By default, models are mapped into memory, which allows the system to load only the necessary parts of the model as needed.
-- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model (implies --no-mmap). This allows you to adapt the pretrained model to specific tasks or domains.
+- `--lora FNAME`: Apply a LoRA (Low-Rank Adaptation) adapter to the model. This allows you to adapt the pretrained model to specific tasks or domains.
- `--lora-base FNAME`: Optional model to use as a base for the layers modified by the LoRA adapter. This flag is used in conjunction with the `--lora` flag, and specifies the base model for the adaptation.
- `-to N`, `--timeout N`: Server read/write timeout in seconds. Default `600`.
- `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`.
diff --git a/examples/server/server.cpp b/examples/server/server.cpp
index 296c5d6..4114343 100644
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -632,7 +632,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -a ALIAS, --alias ALIAS\n");
fprintf(stderr, " set an alias for the model, will be added as `model` field in completion response\n");
- fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n");
+ fprintf(stderr, " --lora FNAME apply LoRA adapter\n");
fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n");
fprintf(stderr, " --host ip address to listen (default (default: %s)\n", sparams.hostname.c_str());
fprintf(stderr, " --port PORT port to listen (default (default: %d)\n", sparams.port);
@@ -820,7 +820,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
break;
}
params.lora_adapter = argv[i];
- params.use_mmap = false;
}
else if (arg == "--lora-base")
{
diff --git a/llama-util.h b/llama-util.h
index 042ebe4..43b6f05 100644
--- a/llama-util.h
+++ b/llama-util.h
@@ -175,13 +175,13 @@ struct llama_mmap {
llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
size = file->size;
int fd = fileno(file->fp);
- int flags = MAP_SHARED;
+ int flags = MAP_PRIVATE;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
if (prefetch) { flags |= MAP_POPULATE; }
#endif
- addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
+ addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
if (addr == MAP_FAILED) {
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}
@@ -223,7 +223,7 @@ struct llama_mmap {
throw std::runtime_error(format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()));
}
- addr = MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, 0);
+ addr = MapViewOfFile(hMapping, FILE_MAP_COPY, 0, 0, 0);
error = GetLastError();
CloseHandle(hMapping);
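The llama-util.h change above is what makes removing the two `params.use_mmap = false;` lines safe: switching to `MAP_PRIVATE` with `PROT_WRITE` (and `FILE_MAP_COPY` on Windows) turns the mapping into a copy-on-write view, so applying a LoRA adapter can patch the mapped weights in memory without the modifications ever reaching the model file on disk. A minimal, POSIX-only sketch of that behaviour, using a hypothetical `model.bin` path:

```cpp
// Copy-on-write mapping sketch: writes land in this process's memory only;
// the file on disk is never modified.
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main() {
    const char * path = "model.bin";   // hypothetical model file
    int fd = open(path, O_RDONLY);     // read-only fd is enough for MAP_PRIVATE
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); return 1; }

    void * addr = mmap(NULL, (size_t) st.st_size,
                       PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
    if (addr == MAP_FAILED) {
        fprintf(stderr, "mmap failed: %s\n", strerror(errno));
        return 1;
    }

    // Modify a mapped byte: the kernel copies the affected page privately,
    // so the original file contents stay untouched.
    ((unsigned char *) addr)[0] ^= 0xFF;

    munmap(addr, (size_t) st.st_size);
    close(fd);
    return 0;
}
```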
diff --git a/llama.cpp b/llama.cpp
index ad7283f..08ec21a 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2454,15 +2454,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
} else {
new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
+ bool convert_incompatible_tensor = false;
if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K ||
quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) {
int nx = tensor.ne.at(0);
int ny = tensor.ne.at(1);
if (nx % QK_K != 0 || ny % QK_K != 0) {
- fprintf(stderr, "\n\n========================= Tensor sizes %d x %d are not divisible by %d\n",nx,ny,QK_K);
- fprintf(stderr, "This is required to be able to use k-quants for now!\n");
- fprintf(stderr, "========================================================================================\n\n");
- throw std::runtime_error("Unsupported tensor size encountered\n");
+ fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K);
+ convert_incompatible_tensor = true;
}
}
if (tensor.name == "output.weight") {
@@ -2490,6 +2489,17 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
}
+ if (convert_incompatible_tensor) {
+ if (tensor.name == "output.weight") {
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
+ fprintf(stderr, "F16 will be used for this tensor instead.\n");
+ } else if (tensor.name == "tok_embeddings.weight") {
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
+ fprintf(stderr, "Q4_0 will be used for this tensor instead.\n");
+ } else {
+ throw std::runtime_error("Unsupported tensor size encountered\n");
+ }
+ }
#endif
float * f32_data;
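For reference, the new k-quants handling in `llama_model_quantize_internal` boils down to the decision below, shown as a condensed, self-contained sketch for the case where the chosen target type is one of the k-quants. `QK_K` is assumed to be 256 (the default super-block size), and the enum stands in for the real `ggml_type` constants:

```cpp
// Condensed sketch of the fallback rule introduced above: tensors whose
// dimensions are not divisible by QK_K no longer abort quantization outright;
// output.weight falls back to F16 and tok_embeddings.weight to Q4_0, while
// any other incompatible tensor still raises an error.
#include <cstdio>
#include <stdexcept>
#include <string>

enum class qtype { KEEP_K_QUANT, F16, Q4_0 };

qtype pick_fallback(const std::string & name, int nx, int ny, int qk_k = 256) {
    if (nx % qk_k == 0 && ny % qk_k == 0) {
        return qtype::KEEP_K_QUANT;  // dimensions are compatible, keep the k-quant type
    }
    fprintf(stderr, "Tensor sizes %d x %d are not divisible by %d, required for k-quants.\n",
            nx, ny, qk_k);
    if (name == "output.weight")         return qtype::F16;   // fall back instead of failing
    if (name == "tok_embeddings.weight") return qtype::Q4_0;  // fall back instead of failing
    throw std::runtime_error("Unsupported tensor size encountered");
}
```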